/*
 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* zstd_decompress_block :
 * this module takes care of decompressing _compressed_ block */

/*-*******************************************************
 *  Dependencies
 *********************************************************/
#include <string.h>   /* memcpy, memmove, memset */
#include "compiler.h" /* prefetch */
#include "cpu.h"      /* bmi2 */
#include "mem.h"      /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "fse.h"
#define HUF_STATIC_LINKING_ONLY
#include "huf.h"
#include "zstd_internal.h"
#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
#include "zstd_ddict.h"               /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"

/*_*******************************************************
 *  Macros
 **********************************************************/

/* These two optional macros force the use one way or another of the two
 * ZSTD_decompressSequences implementations. You can't force in both directions
 * at the same time.
 */
#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
#endif

/*_*******************************************************
 *  Memory operations
 **********************************************************/
/* ZSTD_copy4() :
 * Copies exactly 4 bytes from `src` to `dst`.
 * Implemented with memcpy(), so the two regions must not overlap. */
static void ZSTD_copy4(void* dst, const void* src)
{
  enum { copyLength = 4 };
  memcpy(dst, src, copyLength);
}

/*-*************************************************************
 *   Block decoding
 ***************************************************************/

/*! ZSTD_getcBlockSize() :
 *  Parses the 3-byte little-endian block header located at `src`
 *  and fills `bpPtr` with the decoded properties :
 *  - bit 0      : lastBlock flag
 *  - bits 1-2   : blockType
 *  - bits 3-23  : size field, stored into origSize (only meaningful for RLE blocks)
 * @return : nb of bytes the block occupies in the compressed stream
 *           (1 for an RLE block, the size field otherwise),
 *           or an error code if `srcSize` is too small or the block type is reserved */
size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
{
  U32 header;
  U32 sizeField;

  if (srcSize < ZSTD_blockHeaderSize)
    return ERROR(srcSize_wrong);

  header = MEM_readLE24(src);
  sizeField = header >> 3;

  bpPtr->lastBlock = header & 1;
  bpPtr->blockType = (blockType_e)((header >> 1) & 3);
  bpPtr->origSize = sizeField; /* regenerated size, only used by RLE blocks */

  switch (bpPtr->blockType) {
    case bt_rle:
      return 1; /* a single byte is stored, whatever the regenerated size */
    case bt_reserved:
      return ERROR(corruption_detected);
    default:
      return sizeField;
  }
}

/* Hidden declaration for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize);
/*! ZSTD_decodeLiteralsBlock() :
 *  Decodes the literals section found at the start of a compressed block.
 *  The 2 lowest bits of the first byte select the literals encoding
 *  (basic/raw, rle, compressed, repeat); the next 2 bits select the header layout.
 *  On success, dctx->litPtr and dctx->litSize describe the decoded literals :
 *  litPtr targets either dctx->litBuffer or, for raw literals followed by enough
 *  input bytes, the source buffer itself.
 * @return : nb of bytes read from src (< srcSize ),
 *           or an error code (corruption_detected, dictionary_corrupted)
 *  note : symbol not declared but exposed for fullbench */
size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
{
  /* every header layout below reads at least MIN_CBLOCK_SIZE bytes */
  if (srcSize < MIN_CBLOCK_SIZE)
    return ERROR(corruption_detected);

  {
    const BYTE* const istart = (const BYTE*)src;
    symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);

    switch (litEncType) {
      case set_repeat:
        /* re-using the previous huffman table requires one to be available */
        if (dctx->litEntropy == 0)
          return ERROR(dictionary_corrupted);
        /* fall-through */

      case set_compressed:
        if (srcSize < 5)
          return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
        {
          size_t lhSize, litSize, litCSize;
          U32 singleStream = 0;
          U32 const lhlCode = (istart[0] >> 2) & 3;
          U32 const lhc = MEM_readLE32(istart);
          size_t hufSuccess;
          /* header fields, in bits : encoding type - lhlCode - regenerated size - compressed size */
          switch (lhlCode) {
            case 0:
            case 1:
            default: /* note : default is impossible, since lhlCode into [0..3] */
              /* 2 - 2 - 10 - 10 */
              singleStream = !lhlCode; /* only lhlCode==0 uses a single huffman stream */
              lhSize = 3;
              litSize = (lhc >> 4) & 0x3FF;
              litCSize = (lhc >> 14) & 0x3FF;
              break;
            case 2:
              /* 2 - 2 - 14 - 14 */
              lhSize = 4;
              litSize = (lhc >> 4) & 0x3FFF;
              litCSize = lhc >> 18;
              break;
            case 3:
              /* 2 - 2 - 18 - 18 */
              lhSize = 5;
              litSize = (lhc >> 4) & 0x3FFFF;
              litCSize = (lhc >> 22) + (istart[4] << 10); /* upper bits come from the 5th header byte */
              break;
          }
          if (litSize > ZSTD_BLOCKSIZE_MAX)
            return ERROR(corruption_detected);
          if (litCSize + lhSize > srcSize)
            return ERROR(corruption_detected);

          /* prefetch huffman table if cold */
          if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
            PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
          }

          if (litEncType == set_repeat) {
            /* decode with the table currently referenced by dctx->HUFptr */
            if (singleStream) {
              hufSuccess = HUF_decompress1X_usingDTable_bmi2(
                  dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr, dctx->bmi2);
            } else {
              hufSuccess = HUF_decompress4X_usingDTable_bmi2(
                  dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr, dctx->bmi2);
            }
          } else {
            /* set_compressed : decode into litBuffer using dctx->entropy.hufTable as table storage */
            if (singleStream) {
#if defined(HUF_FORCE_DECOMPRESS_X2)
              hufSuccess = HUF_decompress1X_DCtx_wksp(dctx->entropy.hufTable,
                  dctx->litBuffer,
                  litSize,
                  istart + lhSize,
                  litCSize,
                  dctx->workspace,
                  sizeof(dctx->workspace));
#else
              hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(dctx->entropy.hufTable,
                  dctx->litBuffer,
                  litSize,
                  istart + lhSize,
                  litCSize,
                  dctx->workspace,
                  sizeof(dctx->workspace),
                  dctx->bmi2);
#endif
            } else {
              hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(dctx->entropy.hufTable,
                  dctx->litBuffer,
                  litSize,
                  istart + lhSize,
                  litCSize,
                  dctx->workspace,
                  sizeof(dctx->workspace),
                  dctx->bmi2);
            }
          }

          if (HUF_isError(hufSuccess))
            return ERROR(corruption_detected);

          dctx->litPtr = dctx->litBuffer;
          dctx->litSize = litSize;
          dctx->litEntropy = 1; /* a huffman table is now available for later set_repeat blocks */
          if (litEncType == set_compressed)
            dctx->HUFptr = dctx->entropy.hufTable;
          /* zero the WILDCOPY_OVERLENGTH bytes following the literals */
          memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
          return litCSize + lhSize;
        }

      case set_basic: {
        /* raw literals : stored uncompressed right after the header */
        size_t litSize, lhSize;
        U32 const lhlCode = ((istart[0]) >> 2) & 3;
        switch (lhlCode) {
          case 0:
          case 2:
          default: /* note : default is impossible, since lhlCode into [0..3] */
            lhSize = 1;
            litSize = istart[0] >> 3;
            break;
          case 1:
            lhSize = 2;
            litSize = MEM_readLE16(istart) >> 4;
            break;
          case 3:
            lhSize = 3;
            litSize = MEM_readLE24(istart) >> 4;
            break;
        }

        if (lhSize + litSize + WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
          if (litSize + lhSize > srcSize)
            return ERROR(corruption_detected);
          /* copy into litBuffer and zero the WILDCOPY_OVERLENGTH bytes that follow */
          memcpy(dctx->litBuffer, istart + lhSize, litSize);
          dctx->litPtr = dctx->litBuffer;
          dctx->litSize = litSize;
          memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
          return lhSize + litSize;
        }
        /* direct reference into compressed stream */
        dctx->litPtr = istart + lhSize;
        dctx->litSize = litSize;
        return lhSize + litSize;
      }

      case set_rle: {
        /* a single byte value, repeated litSize times */
        U32 const lhlCode = ((istart[0]) >> 2) & 3;
        size_t litSize, lhSize;
        switch (lhlCode) {
          case 0:
          case 2:
          default: /* note : default is impossible, since lhlCode into [0..3] */
            lhSize = 1;
            litSize = istart[0] >> 3;
            break;
          case 1:
            lhSize = 2;
            litSize = MEM_readLE16(istart) >> 4;
            break;
          case 3:
            lhSize = 3;
            litSize = MEM_readLE24(istart) >> 4;
            if (srcSize < 4)
              return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
            break;
        }
        if (litSize > ZSTD_BLOCKSIZE_MAX)
          return ERROR(corruption_detected);
        /* the fill also covers the WILDCOPY_OVERLENGTH bytes following the literals */
        memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
        dctx->litPtr = dctx->litBuffer;
        dctx->litSize = litSize;
        return lhSize + 1;
      }
      default:
        return ERROR(corruption_detected); /* impossible */
    }
  }
}

/* Default FSE distribution tables.
 * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
 * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
 * They were generated programmatically with following method :
 * - start from default distributions, present in /lib/common/zstd_internal.h
 * - generate tables normally, using ZSTD_buildFSETable()
 * - printout the content of tables
 * - prettify output, report below, test with fuzzer to ensure it's correct */

/* Default FSE distribution table for Literal Lengths.
 * Layout : one header cell, followed by (1 << LL_DEFAULTNORMLOG) decoding cells.
 * Passed to ZSTD_buildSeqTable() as `defaultTable`, and selected as-is
 * when the literal-length encoding type is set_basic. */
static const ZSTD_seqSymbol LL_defaultDTable[(1 << LL_DEFAULTNORMLOG) + 1] = {
    {1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {0, 0, 4, 0},
    {16, 0, 4, 0},
    {32, 0, 5, 1},
    {0, 0, 5, 3},
    {0, 0, 5, 4},
    {0, 0, 5, 6},
    {0, 0, 5, 7},
    {0, 0, 5, 9},
    {0, 0, 5, 10},
    {0, 0, 5, 12},
    {0, 0, 6, 14},
    {0, 1, 5, 16},
    {0, 1, 5, 20},
    {0, 1, 5, 22},
    {0, 2, 5, 28},
    {0, 3, 5, 32},
    {0, 4, 5, 48},
    {32, 6, 5, 64},
    {0, 7, 5, 128},
    {0, 8, 6, 256},
    {0, 10, 6, 1024},
    {0, 12, 6, 4096},
    {32, 0, 4, 0},
    {0, 0, 4, 1},
    {0, 0, 5, 2},
    {32, 0, 5, 4},
    {0, 0, 5, 5},
    {32, 0, 5, 7},
    {0, 0, 5, 8},
    {32, 0, 5, 10},
    {0, 0, 5, 11},
    {0, 0, 6, 13},
    {32, 1, 5, 16},
    {0, 1, 5, 18},
    {32, 1, 5, 22},
    {0, 2, 5, 24},
    {32, 3, 5, 32},
    {0, 3, 5, 40},
    {0, 6, 4, 64},
    {16, 6, 4, 64},
    {32, 7, 5, 128},
    {0, 9, 6, 512},
    {0, 11, 6, 2048},
    {48, 0, 4, 0},
    {16, 0, 4, 1},
    {32, 0, 5, 2},
    {32, 0, 5, 3},
    {32, 0, 5, 5},
    {32, 0, 5, 6},
    {32, 0, 5, 8},
    {32, 0, 5, 9},
    {32, 0, 5, 11},
    {32, 0, 5, 12},
    {0, 0, 6, 15},
    {32, 1, 5, 18},
    {32, 1, 5, 20},
    {32, 2, 5, 24},
    {32, 2, 5, 28},
    {32, 3, 5, 40},
    {32, 4, 5, 48},
    {0, 16, 6, 65536},
    {0, 15, 6, 32768},
    {0, 14, 6, 16384},
    {0, 13, 6, 8192},
}; /* LL_defaultDTable */

/* Default FSE distribution table for Offset Codes.
 * Layout : one header cell, followed by (1 << OF_DEFAULTNORMLOG) decoding cells.
 * Passed to ZSTD_buildSeqTable() as `defaultTable`, and selected as-is
 * when the offset encoding type is set_basic. */
static const ZSTD_seqSymbol OF_defaultDTable[(1 << OF_DEFAULTNORMLOG) + 1] = {
    {1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {0, 0, 5, 0},
    {0, 6, 4, 61},
    {0, 9, 5, 509},
    {0, 15, 5, 32765},
    {0, 21, 5, 2097149},
    {0, 3, 5, 5},
    {0, 7, 4, 125},
    {0, 12, 5, 4093},
    {0, 18, 5, 262141},
    {0, 23, 5, 8388605},
    {0, 5, 5, 29},
    {0, 8, 4, 253},
    {0, 14, 5, 16381},
    {0, 20, 5, 1048573},
    {0, 2, 5, 1},
    {16, 7, 4, 125},
    {0, 11, 5, 2045},
    {0, 17, 5, 131069},
    {0, 22, 5, 4194301},
    {0, 4, 5, 13},
    {16, 8, 4, 253},
    {0, 13, 5, 8189},
    {0, 19, 5, 524285},
    {0, 1, 5, 1},
    {16, 6, 4, 61},
    {0, 10, 5, 1021},
    {0, 16, 5, 65533},
    {0, 28, 5, 268435453},
    {0, 27, 5, 134217725},
    {0, 26, 5, 67108861},
    {0, 25, 5, 33554429},
    {0, 24, 5, 16777213},
}; /* OF_defaultDTable */

/* Default FSE distribution table for Match Lengths.
 * Layout : one header cell, followed by (1 << ML_DEFAULTNORMLOG) decoding cells.
 * Passed to ZSTD_buildSeqTable() as `defaultTable`, and selected as-is
 * when the match-length encoding type is set_basic. */
static const ZSTD_seqSymbol ML_defaultDTable[(1 << ML_DEFAULTNORMLOG) + 1] = {
    {1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
    /* nextState, nbAddBits, nbBits, baseVal */
    {0, 0, 6, 3},
    {0, 0, 4, 4},
    {32, 0, 5, 5},
    {0, 0, 5, 6},
    {0, 0, 5, 8},
    {0, 0, 5, 9},
    {0, 0, 5, 11},
    {0, 0, 6, 13},
    {0, 0, 6, 16},
    {0, 0, 6, 19},
    {0, 0, 6, 22},
    {0, 0, 6, 25},
    {0, 0, 6, 28},
    {0, 0, 6, 31},
    {0, 0, 6, 34},
    {0, 1, 6, 37},
    {0, 1, 6, 41},
    {0, 2, 6, 47},
    {0, 3, 6, 59},
    {0, 4, 6, 83},
    {0, 7, 6, 131},
    {0, 9, 6, 515},
    {16, 0, 4, 4},
    {0, 0, 4, 5},
    {32, 0, 5, 6},
    {0, 0, 5, 7},
    {32, 0, 5, 9},
    {0, 0, 5, 10},
    {0, 0, 6, 12},
    {0, 0, 6, 15},
    {0, 0, 6, 18},
    {0, 0, 6, 21},
    {0, 0, 6, 24},
    {0, 0, 6, 27},
    {0, 0, 6, 30},
    {0, 0, 6, 33},
    {0, 1, 6, 35},
    {0, 1, 6, 39},
    {0, 2, 6, 43},
    {0, 3, 6, 51},
    {0, 4, 6, 67},
    {0, 5, 6, 99},
    {0, 8, 6, 259},
    {32, 0, 4, 4},
    {48, 0, 4, 4},
    {16, 0, 4, 5},
    {32, 0, 5, 7},
    {32, 0, 5, 8},
    {32, 0, 5, 10},
    {32, 0, 5, 11},
    {0, 0, 6, 14},
    {0, 0, 6, 17},
    {0, 0, 6, 20},
    {0, 0, 6, 23},
    {0, 0, 6, 26},
    {0, 0, 6, 29},
    {0, 0, 6, 32},
    {0, 16, 6, 65539},
    {0, 15, 6, 32771},
    {0, 14, 6, 16387},
    {0, 13, 6, 8195},
    {0, 12, 6, 4099},
    {0, 11, 6, 2051},
    {0, 10, 6, 1027},
}; /* ML_defaultDTable */

/* ZSTD_buildSeqTable_rle() :
 * Fills `dt` with a degenerate decoding table holding a single symbol :
 * the header (dt[0]) gets tableLog 0 and fastMode 0, and the only cell (dt[1])
 * consumes no state bits and always yields `baseValue` with `nbAddBits` extra bits. */
static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
{
  void* const headerSlot = dt;
  ZSTD_seqSymbol_header* const header = (ZSTD_seqSymbol_header*)headerSlot;
  ZSTD_seqSymbol* const onlyCell = &dt[1];

  /* header */
  header->fastMode = 0;
  header->tableLog = 0;

  /* single decoding cell */
  onlyCell->nextState = 0;
  onlyCell->nbBits = 0;
  assert(nbAddBits < 255);
  onlyCell->nbAdditionalBits = (BYTE)nbAddBits;
  onlyCell->baseValue = baseValue;
}

/* ZSTD_buildFSETable() :
 * Builds the FSE decoding table of one symbol type (ll, ml or off) into `dt`.
 * dt[0] receives the header (tableLog, fastMode); the (1 << tableLog) cells start at dt[1].
 * `baseValue` and `nbAdditionalBits` are indexed by symbol and copied into each cell.
 * Cannot fail if input is valid =>
 * all inputs are presumed validated at this stage. */
void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue,
    const U32* baseValue, const U32* nbAdditionalBits, unsigned tableLog)
{
  ZSTD_seqSymbol* const cells = dt + 1;
  U16 nextStateOf[MaxSeq + 1];

  U32 const nbSymbols = maxSymbolValue + 1;
  U32 const nbCells = 1 << tableLog;
  U32 const cellMask = nbCells - 1;
  U32 lowProbTop = nbCells - 1; /* low-probability symbols are stacked from the top of the table */
  U32 sym;
  U32 cellIdx;

  /* Sanity Checks */
  assert(maxSymbolValue <= MaxSeq);
  assert(tableLog <= MaxFSELog);

  /* Pass 1 : write header, place low-probability symbols, record first state of each symbol */
  {
    ZSTD_seqSymbol_header header;
    S16 const largeLimit = (S16)(1 << (tableLog - 1));
    header.tableLog = tableLog;
    header.fastMode = 1;
    for (sym = 0; sym < nbSymbols; sym++) {
      short const count = normalizedCounter[sym];
      if (count == -1) {
        cells[lowProbTop--].baseValue = sym; /* baseValue temporarily stores the symbol */
        nextStateOf[sym] = 1;
        continue;
      }
      if (count >= largeLimit)
        header.fastMode = 0;
      nextStateOf[sym] = (U16)count;
    }
    memcpy(dt, &header, sizeof(header));
  }

  /* Pass 2 : spread remaining symbols, skipping the low-probability area */
  {
    U32 const step = FSE_TABLESTEP(nbCells);
    U32 cursor = 0;
    for (sym = 0; sym < nbSymbols; sym++) {
      int remaining;
      for (remaining = normalizedCounter[sym]; remaining > 0; remaining--) {
        cells[cursor].baseValue = sym;
        do {
          cursor = (cursor + step) & cellMask;
        } while (cursor > lowProbTop); /* lowprob area */
      }
    }
    assert(cursor == 0); /* cursor must reach all cells once, otherwise normalizedCounter is incorrect */
  }

  /* Pass 3 : turn each cell's temporary symbol into its final decoding entry */
  for (cellIdx = 0; cellIdx < nbCells; cellIdx++) {
    ZSTD_seqSymbol* const cell = &cells[cellIdx];
    U32 const symbol = cell->baseValue;
    U32 const state = nextStateOf[symbol]++;
    cell->nbBits = (BYTE)(tableLog - BIT_highbit32(state));
    cell->nextState = (U16)((state << cell->nbBits) - nbCells);
    assert(nbAdditionalBits[symbol] < 255);
    cell->nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
    cell->baseValue = baseValue[symbol];
  }
}

/*! ZSTD_buildSeqTable() :
 *  Selects or builds the decoding table of one symbol type, depending on `type` :
 *  - set_basic      : *DTablePtr references `defaultTable`
 *  - set_rle        : a single-symbol table is built into `DTableSpace` from the byte at `src`
 *  - set_repeat     : *DTablePtr is left untouched (requires `flagRepeatTable`)
 *  - set_compressed : the distribution is read from `src` and a table is built into `DTableSpace`
 * @return : nb bytes read from src,
 *           or an error code if it fails */
static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
    symbolEncodingType_e type, unsigned max, U32 maxLog, const void* src, size_t srcSize, const U32* baseValue,
    const U32* nbAdditionalBits, const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, int ddictIsCold, int nbSeq)
{
  switch (type) {
    case set_basic:
      *DTablePtr = defaultTable;
      return 0;

    case set_rle: {
      const BYTE* const symbolPtr = (const BYTE*)src;
      if (srcSize == 0)
        return ERROR(srcSize_wrong);
      if (*symbolPtr > max)
        return ERROR(corruption_detected);
      {
        U32 const rleSymbol = *symbolPtr;
        ZSTD_buildSeqTable_rle(DTableSpace, baseValue[rleSymbol], nbAdditionalBits[rleSymbol]);
      }
      *DTablePtr = DTableSpace;
      return 1;
    }

    case set_repeat:
      if (flagRepeatTable == 0)
        return ERROR(corruption_detected);
      /* prefetch FSE table if used */
      if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
        const void* const tableStart = *DTablePtr;
        size_t const tableBytes = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
        PREFETCH_AREA(tableStart, tableBytes);
      }
      return 0;

    case set_compressed: {
      S16 normCount[MaxSeq + 1];
      unsigned readLog;
      size_t const nCountSize = FSE_readNCount(normCount, &max, &readLog, src, srcSize);
      if (FSE_isError(nCountSize) || readLog > maxLog)
        return ERROR(corruption_detected);
      ZSTD_buildFSETable(DTableSpace, normCount, max, baseValue, nbAdditionalBits, readLog);
      *DTablePtr = DTableSpace;
      return nCountSize;
    }

    default: /* impossible */
      assert(0);
      return ERROR(GENERIC);
  }
}

/*! ZSTD_decodeSeqHeaders() :
 *  Parses the header of the sequences section starting at `src` :
 *  1) the number of sequences, stored on 1 to 3 bytes :
 *     - 0x00        : no sequence, the section must be exactly 1 byte long
 *     - 0x01..0x7F  : value of the byte itself
 *     - 0x80..0xFE  : ((byte0 - 0x80) << 8) + byte1
 *     - 0xFF        : little-endian 16-bit value + LONGNBSEQ
 *  2) one byte holding the encoding types of the LL, OF and ML tables (2 bits each),
 *  3) the table descriptions, whose size depends on each encoding type.
 *  The resulting tables are referenced by dctx->LLTptr, dctx->OFTptr and dctx->MLTptr.
 * @param nbSeqPtr : receives the number of sequences
 * @return : nb of bytes read from src (position where the sequences bitstream starts),
 *           or an error code (srcSize_wrong, corruption_detected) */
size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize)
{
  const BYTE* const istart = (const BYTE* const)src;
  const BYTE* const iend = istart + srcSize;
  const BYTE* ip = istart;
  int nbSeq;
  DEBUGLOG(5, "ZSTD_decodeSeqHeaders");

  /* check */
  if (srcSize < MIN_SEQUENCES_SIZE)
    return ERROR(srcSize_wrong);

  /* SeqHead */
  nbSeq = *ip++;
  if (!nbSeq) {
    *nbSeqPtr = 0;
    if (srcSize != 1)
      return ERROR(srcSize_wrong);
    return 1;
  }
  if (nbSeq > 0x7F) {
    if (nbSeq == 0xFF) {
      if (ip + 2 > iend)
        return ERROR(srcSize_wrong);
      nbSeq = MEM_readLE16(ip) + LONGNBSEQ;
      ip += 2;
    } else {
      if (ip >= iend)
        return ERROR(srcSize_wrong);
      nbSeq = ((nbSeq - 0x80) << 8) + *ip++;
    }
  }
  *nbSeqPtr = nbSeq;

  /* FSE table descriptors */
  /* Only the encoding-types byte is unconditionally required here.
   * Requiring more (the previous check demanded 4 bytes) rejects valid sections :
   * when all three tables are repeated and were RLE tables (tableLog 0),
   * no table description follows and the bitstream can be a single byte.
   * Each table builder receives the exact remaining size (iend - ip) and
   * validates its own reads.
   * NOTE(review): the bitstream reader invoked by the caller is expected to reject
   * an empty remainder - confirm against BIT_initDStream(). */
  if (ip >= iend)
    return ERROR(srcSize_wrong); /* minimum possible size : 1 byte for symbol encoding types */
  {
    symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
    symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
    symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
    ip++;

    /* Build DTables */
    {
      size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable,
          &dctx->LLTptr,
          LLtype,
          MaxLL,
          LLFSELog,
          ip,
          iend - ip,
          LL_base,
          LL_bits,
          LL_defaultDTable,
          dctx->fseEntropy,
          dctx->ddictIsCold,
          nbSeq);
      if (ZSTD_isError(llhSize))
        return ERROR(corruption_detected);
      ip += llhSize;
    }

    {
      size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable,
          &dctx->OFTptr,
          OFtype,
          MaxOff,
          OffFSELog,
          ip,
          iend - ip,
          OF_base,
          OF_bits,
          OF_defaultDTable,
          dctx->fseEntropy,
          dctx->ddictIsCold,
          nbSeq);
      if (ZSTD_isError(ofhSize))
        return ERROR(corruption_detected);
      ip += ofhSize;
    }

    {
      size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable,
          &dctx->MLTptr,
          MLtype,
          MaxML,
          MLFSELog,
          ip,
          iend - ip,
          ML_base,
          ML_bits,
          ML_defaultDTable,
          dctx->fseEntropy,
          dctx->ddictIsCold,
          nbSeq);
      if (ZSTD_isError(mlhSize))
        return ERROR(corruption_detected);
      ip += mlhSize;
    }
  }

  return ip - istart;
}

/* seq_t :
 * One decoded sequence : a run of literals followed by a match copy. */
typedef struct {
  size_t litLength;   /* nb of literal bytes copied to the output before the match */
  size_t matchLength; /* nb of bytes copied from the match */
  size_t offset;      /* distance of the match, counted backwards from the end of the literals in the output */
  const BYTE* match;  /* match source position; read by ZSTD_execSequenceLong() only (in the visible code) */
} seq_t;

/* ZSTD_fseState :
 * Decoding state of one FSE stream. */
typedef struct {
  size_t state;                /* index of the current cell within `table` */
  const ZSTD_seqSymbol* table; /* first decoding cell, i.e. the cell following the table header */
} ZSTD_fseState;

/* seqState_t :
 * Everything needed to decode the sequences of one block. */
typedef struct {
  BIT_DStream_t DStream;           /* bitstream the sequences are read from */
  ZSTD_fseState stateLL;           /* FSE state for literal lengths */
  ZSTD_fseState stateOffb;         /* FSE state for offset codes */
  ZSTD_fseState stateML;           /* FSE state for match lengths */
  size_t prevOffset[ZSTD_REP_NUM]; /* most recent offsets, consulted when decoding repeat-offset codes */
  const BYTE* prefixStart;         /* NOTE(review): presumably start of the current prefix in the output - not used in the visible code */
  const BYTE* dictEnd;             /* NOTE(review): presumably end of the external dictionary - not used in the visible code */
  size_t pos;                      /* NOTE(review): presumably current output position, for the long-offset decoder - confirm */
} seqState_t;

/* ZSTD_execSequenceLast7():
 * exceptional case : decompress a match starting within last 7 bytes of output buffer.
 * requires more careful checks, to ensure there is no overflow.
 * performance does not matter though.
 * note : this case is supposed to be never generated "naturally" by reference encoder,
 *        since in most cases it needs at least 8 bytes to look for a match.
 *        but it's allowed by the specification.
 *
 * op       : current write position in the output buffer
 * oend     : end of the output buffer
 * sequence : sequence to execute (literals, then match)
 * litPtr   : in/out, current read position in the literals; advanced by litLength
 * litLimit : end of the literals
 * base     : start of the current prefix in the output
 * vBase    : lowest virtual position reachable through the external dictionary
 * dictEnd  : end of the external dictionary
 * @return : nb of bytes written (litLength + matchLength),
 *           or an error code (dstSize_tooSmall, corruption_detected) */
FORCE_NOINLINE
size_t ZSTD_execSequenceLast7(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr,
    const BYTE* const litLimit, const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
{
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
  BYTE* oLitEnd;
  BYTE* oMatchEnd;
  const BYTE* match;

  /* check :
   * lengths are compared against the remaining room, instead of comparing
   * `op + length` against `oend` : the pointer sum can wrap around the address
   * space (32-bits) and let an oversized sequence pass the test. */
  if (op > oend || sequence.litLength > (size_t)(oend - op) ||
      sequence.matchLength > (size_t)(oend - op) - sequence.litLength)
    return ERROR(dstSize_tooSmall); /* last match must fit within dstBuffer */
  if (*litPtr > litLimit || sequence.litLength > (size_t)(litLimit - *litPtr))
    return ERROR(corruption_detected); /* try to read beyond literal buffer */

  /* both ends are now known to lie within [op, oend] */
  oLitEnd = op + sequence.litLength;
  oMatchEnd = op + sequenceLength;
  match = oLitEnd - sequence.offset;

  /* copy literals */
  while (op < oLitEnd)
    *op++ = *(*litPtr)++;

  /* copy Match */
  if (sequence.offset > (size_t)(oLitEnd - base)) {
    /* offset beyond prefix */
    if (sequence.offset > (size_t)(oLitEnd - vBase))
      return ERROR(corruption_detected);
    match = dictEnd - (base - match);
    if (match + sequence.matchLength <= dictEnd) {
      /* match entirely within the external dictionary */
      memmove(oLitEnd, match, sequence.matchLength);
      return sequenceLength;
    }
    /* span extDict & currentPrefixSegment */
    {
      size_t const length1 = dictEnd - match;
      memmove(oLitEnd, match, length1);
      op = oLitEnd + length1;
      sequence.matchLength -= length1;
      match = base;
    }
  }
  /* byte-by-byte copy : safe for overlapping matches, never writes beyond oMatchEnd */
  while (op < oMatchEnd)
    *op++ = *match++;
  return sequenceLength;
}

/* ZSTD_execSequence() :
 * Executes one sequence : copies `sequence.litLength` literals from *litPtr to `op`,
 * then copies `sequence.matchLength` bytes from the match located `sequence.offset`
 * bytes before the end of the literals. The match may start in the external
 * dictionary (ending at `dictEnd`) and continue into the current prefix.
 * Sequences whose literals end within the last WILDCOPY_OVERLENGTH bytes of the
 * output are delegated to ZSTD_execSequenceLast7().
 * *litPtr is advanced by litLength.
 * @return : nb of bytes written (litLength + matchLength),
 *           or an error code (dstSize_tooSmall, corruption_detected) */
HINT_INLINE
size_t ZSTD_execSequence(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit,
    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
  BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
  BYTE* oLitEnd;
  BYTE* oMatchEnd;
  const BYTE* iLitEnd;
  const BYTE* match;

  /* check :
   * lengths are compared against the remaining room, instead of comparing
   * `op + length` against `oend` : the pointer sum can wrap around the address
   * space (32-bits) and let an oversized sequence pass the test. */
  if (op > oend || sequence.litLength > (size_t)(oend - op) ||
      sequence.matchLength > (size_t)(oend - op) - sequence.litLength)
    return ERROR(dstSize_tooSmall); /* whole sequence must fit within dstBuffer */
  if (*litPtr > litLimit || sequence.litLength > (size_t)(litLimit - *litPtr))
    return ERROR(corruption_detected); /* over-read beyond lit buffer */

  /* both ends are now known to lie within their buffers */
  oLitEnd = op + sequence.litLength;
  oMatchEnd = op + sequenceLength;
  iLitEnd = *litPtr + sequence.litLength;
  match = oLitEnd - sequence.offset;

  if (oLitEnd > oend_w)
    return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);

  /* copy Literals */
  ZSTD_copy8(op, *litPtr);
  if (sequence.litLength > 8)
    ZSTD_wildcopy(op + 8,
        (*litPtr) + 8,
        sequence.litLength -
            8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
  op = oLitEnd;
  *litPtr = iLitEnd; /* update for next sequence */

  /* copy Match */
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
    /* offset beyond prefix -> go into extDict */
    if (sequence.offset > (size_t)(oLitEnd - virtualStart))
      return ERROR(corruption_detected);
    match = dictEnd + (match - prefixStart);
    if (match + sequence.matchLength <= dictEnd) {
      memmove(oLitEnd, match, sequence.matchLength);
      return sequenceLength;
    }
    /* span extDict & currentPrefixSegment */
    {
      size_t const length1 = dictEnd - match;
      memmove(oLitEnd, match, length1);
      op = oLitEnd + length1;
      sequence.matchLength -= length1;
      match = prefixStart;
      if (op > oend_w || sequence.matchLength < MINMATCH) {
        /* too close to the end, or too short, for the 8-byte copies below */
        U32 i;
        for (i = 0; i < sequence.matchLength; ++i)
          op[i] = match[i];
        return sequenceLength;
      }
    }
  }
  /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */

  /* match within prefix */
  if (sequence.offset < 8) {
    /* close range match, overlap */
    static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
    static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */
    int const sub2 = dec64table[sequence.offset];
    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];
    match += dec32table[sequence.offset];
    ZSTD_copy4(op + 4, match);
    match -= sub2;
  } else {
    ZSTD_copy8(op, match);
  }
  op += 8;
  match += 8;

  if (oMatchEnd > oend - (16 - MINMATCH)) {
    /* match ends close to oend : wildcopy up to oend_w only, then finish byte by byte */
    if (op < oend_w) {
      ZSTD_wildcopy(op, match, oend_w - op);
      match += oend_w - op;
      op = oend_w;
    }
    while (op < oMatchEnd)
      *op++ = *match++;
  } else {
    ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */
  }
  return sequenceLength;
}

/* ZSTD_execSequenceLong() :
 * Variant of ZSTD_execSequence() in which the match source position is not derived
 * from the output position : it is read from `sequence.match`.
 * Copies `sequence.litLength` literals from *litPtr to `op`, then
 * `sequence.matchLength` bytes from the match, which may start in the external
 * dictionary (ending at `dictEnd`) and continue into the current prefix.
 * Sequences whose literals end within the last WILDCOPY_OVERLENGTH bytes of the
 * output are delegated to ZSTD_execSequenceLast7().
 * *litPtr is advanced by litLength.
 * @return : nb of bytes written (litLength + matchLength),
 *           or an error code (dstSize_tooSmall, corruption_detected) */
HINT_INLINE
size_t ZSTD_execSequenceLong(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr,
    const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const dictStart, const BYTE* const dictEnd)
{
  size_t const sequenceLength = sequence.litLength + sequence.matchLength;
  BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
  const BYTE* match = sequence.match;
  BYTE* oLitEnd;
  BYTE* oMatchEnd;
  const BYTE* iLitEnd;

  /* check :
   * lengths are compared against the remaining room, instead of comparing
   * `op + length` against `oend` : the pointer sum can wrap around the address
   * space (32-bits) and let an oversized sequence pass the test. */
  if (op > oend || sequence.litLength > (size_t)(oend - op) ||
      sequence.matchLength > (size_t)(oend - op) - sequence.litLength)
    return ERROR(dstSize_tooSmall); /* whole sequence must fit within dstBuffer */
  if (*litPtr > litLimit || sequence.litLength > (size_t)(litLimit - *litPtr))
    return ERROR(corruption_detected); /* over-read beyond lit buffer */

  /* both ends are now known to lie within their buffers */
  oLitEnd = op + sequence.litLength;
  oMatchEnd = op + sequenceLength;
  iLitEnd = *litPtr + sequence.litLength;

  if (oLitEnd > oend_w)
    return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);

  /* copy Literals */
  ZSTD_copy8(op, *litPtr); /* note : op <= oLitEnd <= oend_w == oend - 8 */
  if (sequence.litLength > 8)
    ZSTD_wildcopy(op + 8,
        (*litPtr) + 8,
        sequence.litLength -
            8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
  op = oLitEnd;
  *litPtr = iLitEnd; /* update for next sequence */

  /* copy Match */
  if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
    /* offset beyond prefix */
    if (sequence.offset > (size_t)(oLitEnd - dictStart))
      return ERROR(corruption_detected);
    if (match + sequence.matchLength <= dictEnd) {
      memmove(oLitEnd, match, sequence.matchLength);
      return sequenceLength;
    }
    /* span extDict & currentPrefixSegment */
    {
      size_t const length1 = dictEnd - match;
      memmove(oLitEnd, match, length1);
      op = oLitEnd + length1;
      sequence.matchLength -= length1;
      match = prefixStart;
      if (op > oend_w || sequence.matchLength < MINMATCH) {
        /* too close to the end, or too short, for the 8-byte copies below */
        U32 i;
        for (i = 0; i < sequence.matchLength; ++i)
          op[i] = match[i];
        return sequenceLength;
      }
    }
  }
  assert(op <= oend_w);
  assert(sequence.matchLength >= MINMATCH);

  /* match within prefix */
  if (sequence.offset < 8) {
    /* close range match, overlap */
    static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4};   /* added */
    static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */
    int const sub2 = dec64table[sequence.offset];
    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];
    match += dec32table[sequence.offset];
    ZSTD_copy4(op + 4, match);
    match -= sub2;
  } else {
    ZSTD_copy8(op, match);
  }
  op += 8;
  match += 8;

  if (oMatchEnd > oend - (16 - MINMATCH)) {
    /* match ends close to oend : wildcopy up to oend_w only, then finish byte by byte */
    if (op < oend_w) {
      ZSTD_wildcopy(op, match, oend_w - op);
      match += oend_w - op;
      op = oend_w;
    }
    while (op < oMatchEnd)
      *op++ = *match++;
  } else {
    ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */
  }
  return sequenceLength;
}

/* ZSTD_initFseState() :
 * Initializes `DStatePtr` for decoding with table `dt` :
 * the initial state is read from `bitD` using the tableLog stored in the table header,
 * the bitstream is then reloaded, and the state's table targets the first decoding cell. */
static void ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
{
  const void* const headerSlot = dt;
  const ZSTD_seqSymbol_header* const header = (const ZSTD_seqSymbol_header*)headerSlot;

  DStatePtr->table = &dt[1]; /* cells start right after the header */
  DStatePtr->state = BIT_readBits(bitD, header->tableLog);
  DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", (U32)DStatePtr->state, header->tableLog);
  BIT_reloadDStream(bitD);
}

/* ZSTD_updateFseState() :
 * Moves `DStatePtr` to its next state : the current cell provides a base state
 * (nextState), to which nbBits bits read from `bitD` are added. */
FORCE_INLINE_TEMPLATE void ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
{
  ZSTD_seqSymbol const currentCell = DStatePtr->table[DStatePtr->state];
  U32 const bitsToRead = currentCell.nbBits;
  size_t const stateDelta = BIT_readBits(bitD, bitsToRead);
  DStatePtr->state = currentCell.nextState + stateDelta;
}

/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
 * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
 * bits before reloading. This value is the maximum number of bits we read
 * after reloading when we are decoding long offsets.
 */
#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \
  (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 : 0)

/* ZSTD_longOffset_e :
 * Tells the sequence decoder whether offsets may need more bits than can be read
 * in one go on 32-bit targets, in which case they are read in two steps.
 * ZSTD_lo_isLongOffset must be equal to 1 (statically asserted by the decoder). */
typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset = 1 } ZSTD_longOffset_e;

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
/*! ZSTD_decodeSequence() :
 *  Decodes one sequence (offset, match length, literal length) from the bitstream,
 *  using the table cells selected by the current LL / ML / Offset FSE states.
 *  Side effects : consumes bits from seqState->DStream, updates the offset
 *  history seqState->prevOffset[], then advances the three FSE states.
 *  @param seqState : decoding state (bitstream, FSE states, offset history)
 *  @param longOffsets : when set, in 32-bit mode, an offset needing at least
 *                       STREAM_ACCUMULATOR_MIN_32 extra bits is read in two steps
 *                       with a bitstream reload in between
 *  @return : the decoded sequence; only `offset`, `matchLength` and `litLength` are set here */
FORCE_INLINE_TEMPLATE seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
{
  seq_t seq;
  /* snapshot the nb of extra bits and the base value of each field, from the cell of each current state */
  U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
  U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
  U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
  U32 const totalBits = llBits + mlBits + ofBits;
  U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
  U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
  U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;

  /* sequence */
  {
    size_t offset;
    if (!ofBits)
      offset = 0; /* no extra bits to read */
    else {
      ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
      ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
      assert(ofBits <= MaxOff);
      if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
        /* split read : first the bits not exceeding (32 - bitsConsumed), as the high part;
         * then, after a reload, the remaining `extraBits` as the low part */
        U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
        offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
        BIT_reloadDStream(&seqState->DStream);
        if (extraBits)
          offset += BIT_readBitsFast(&seqState->DStream, extraBits);
        assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
      } else {
        offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits /*>0*/); /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
        if (MEM_32bits())
          BIT_reloadDStream(&seqState->DStream);
      }
    }

    /* ofBits <= 1 : the decoded value selects an entry of the offset history */
    if (ofBits <= 1) {
      offset += (llBase == 0); /* selector is shifted by one when the literal length base value is 0 */
      if (offset) {
        /* selector 3 means (most recent offset - 1); selectors 1 and 2 index the history directly */
        size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
        temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
        /* move the selected offset to the front of the history */
        if (offset != 1)
          seqState->prevOffset[2] = seqState->prevOffset[1];
        seqState->prevOffset[1] = seqState->prevOffset[0];
        seqState->prevOffset[0] = offset = temp;
      } else { /* offset == 0 */
        /* reuse the most recent offset; history is left unchanged */
        offset = seqState->prevOffset[0];
      }
    } else {
      /* new offset value : push it at the front of the history */
      seqState->prevOffset[2] = seqState->prevOffset[1];
      seqState->prevOffset[1] = seqState->prevOffset[0];
      seqState->prevOffset[0] = offset;
    }
    seq.offset = offset;
  }

  seq.matchLength = mlBase + ((mlBits > 0) ? BIT_readBitsFast(&seqState->DStream, mlBits /*>0*/) : 0); /* <=  16 bits */
  /* reload only when the bits read so far plus the bits still needed may exceed the accumulator guarantee */
  if (MEM_32bits() && (mlBits + llBits >= STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32))
    BIT_reloadDStream(&seqState->DStream);
  if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64 - (LLFSELog + MLFSELog + OffFSELog)))
    BIT_reloadDStream(&seqState->DStream);
  /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
  ZSTD_STATIC_ASSERT(16 + LLFSELog + MLFSELog + OffFSELog < STREAM_ACCUMULATOR_MIN_64);

  seq.litLength = llBase + ((llBits > 0) ? BIT_readBitsFast(&seqState->DStream, llBits /*>0*/) : 0); /* <=  16 bits */
  if (MEM_32bits())
    BIT_reloadDStream(&seqState->DStream);

  DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);

  /* ANS state update */
  ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <=  9 bits */
  ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <=  9 bits */
  if (MEM_32bits())
    BIT_reloadDStream(&seqState->DStream);                       /* <= 18 bits */
  ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <=  8 bits */

  return seq;
}

/*! ZSTD_decompressSequences_body() :
 *  Regenerates a block's content into `dst` : decodes `nbSeq` sequences from
 *  [seqStart, seqStart+seqSize) and executes each of them immediately, then
 *  appends the literals remaining in dctx's literal buffer.
 *  When nbSeq != 0, dctx->fseEntropy is set and dctx->entropy.rep[] receives
 *  the final offset history.
 *  @return : nb of bytes written into `dst`, or an error code :
 *            corruption_detected if the bitstream cannot be initialized or ends
 *            before all sequences are decoded, dstSize_tooSmall if the last
 *            literals do not fit, or any error reported by ZSTD_execSequence(). */
FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  const BYTE* const seqBegin = (const BYTE*)seqStart;
  const BYTE* const seqEnd = seqBegin + seqSize;
  BYTE* const outBegin = (BYTE*)dst;
  BYTE* const outLimit = outBegin + maxDstSize;
  BYTE* outPos = outBegin;
  const BYTE* litCursor = dctx->litPtr;
  const BYTE* const litLimit = litCursor + dctx->litSize;
  const BYTE* const prefixBegin = (const BYTE*)(dctx->prefixStart);
  const BYTE* const virtualBegin = (const BYTE*)(dctx->virtualStart);
  const BYTE* const dictLimit = (const BYTE*)(dctx->dictEnd);
  DEBUGLOG(5, "ZSTD_decompressSequences_body");

  /* Regenerate sequences */
  if (nbSeq != 0) {
    seqState_t state;
    U32 repIdx;

    dctx->fseEntropy = 1;

    /* start from the offset history stored in the context */
    for (repIdx = 0; repIdx < ZSTD_REP_NUM; repIdx++) {
      state.prevOffset[repIdx] = dctx->entropy.rep[repIdx];
    }

    CHECK_E(BIT_initDStream(&state.DStream, seqBegin, seqEnd - seqBegin), corruption_detected);
    ZSTD_initFseState(&state.stateLL, &state.DStream, dctx->LLTptr);
    ZSTD_initFseState(&state.stateOffb, &state.DStream, dctx->OFTptr);
    ZSTD_initFseState(&state.stateML, &state.DStream, dctx->MLTptr);

    /* the bitstream is reloaded before each sequence, and once more when nbSeq reaches 0 */
    while ((BIT_reloadDStream(&state.DStream) <= BIT_DStream_completed) && (nbSeq != 0)) {
      seq_t const sequence = ZSTD_decodeSequence(&state, isLongOffset);
      size_t const producedSize = ZSTD_execSequence(
          outPos, outLimit, sequence, &litCursor, litLimit, prefixBegin, virtualBegin, dictLimit);
      nbSeq--;
      DEBUGLOG(6, "regenerated sequence size : %u", (U32)producedSize);
      if (ZSTD_isError(producedSize)) {
        return producedSize;
      }
      outPos += producedSize;
    }

    /* all announced sequences must have been decoded */
    DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
    if (nbSeq != 0) {
      return ERROR(corruption_detected);
    }

    /* store the offset history back into the context, for the next block */
    for (repIdx = 0; repIdx < ZSTD_REP_NUM; repIdx++) {
      dctx->entropy.rep[repIdx] = (U32)(state.prevOffset[repIdx]);
    }
  }

  /* last literal segment */
  {
    size_t const remainingLits = (size_t)(litLimit - litCursor);
    size_t const remainingRoom = (size_t)(outLimit - outPos);
    if (remainingLits > remainingRoom) {
      return ERROR(dstSize_tooSmall);
    }
    memcpy(outPos, litCursor, remainingLits);
    outPos += remainingLits;
  }

  return (size_t)(outPos - outBegin);
}

/* ZSTD_decompressSequences_default() :
 * Instantiation of ZSTD_decompressSequences_body() without any target attribute.
 * Forwards all arguments unchanged and returns the body's result. */
static size_t ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart,
    size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
/*! ZSTD_decodeSequenceLong() :
 *  Decodes one sequence (offset, match length, literal length) from the bitstream,
 *  and additionally computes `seq.match`, the address the match is expected to
 *  start from, so that the caller can prefetch it before executing the sequence.
 *  Side effects : consumes bits from seqState->DStream, updates the offset
 *  history seqState->prevOffset[], advances seqState->pos by
 *  (litLength + matchLength), then advances the three FSE states.
 *  @param seqState : decoding state (bitstream, FSE states, offset history, position)
 *  @param longOffsets : when set, in 32-bit mode, at most (STREAM_ACCUMULATOR_MIN_32 - 1)
 *                       offset bits are read before the reload, the rest after it
 *  @return : the decoded sequence, with `offset`, `matchLength`, `litLength` and `match` set */
FORCE_INLINE_TEMPLATE seq_t ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
{
  seq_t seq;
  /* snapshot the nb of extra bits and the base value of each field, from the cell of each current state */
  U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
  U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
  U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
  U32 const totalBits = llBits + mlBits + ofBits;
  U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
  U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
  U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;

  /* sequence */
  {
    size_t offset;
    if (!ofBits)
      offset = 0; /* no extra bits to read */
    else {
      ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
      ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
      assert(ofBits <= MaxOff);
      if (MEM_32bits() && longOffsets) {
        /* split read : high part first, then `extraBits` low bits after the reload.
         * extraBits is 0 whenever ofBits <= STREAM_ACCUMULATOR_MIN_32 - 1 */
        U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32 - 1);
        offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
        if (MEM_32bits() || extraBits)
          BIT_reloadDStream(&seqState->DStream);
        if (extraBits)
          offset += BIT_readBitsFast(&seqState->DStream, extraBits);
      } else {
        offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
        if (MEM_32bits())
          BIT_reloadDStream(&seqState->DStream);
      }
    }

    /* ofBits <= 1 : the decoded value selects an entry of the offset history */
    if (ofBits <= 1) {
      offset += (llBase == 0); /* selector is shifted by one when the literal length base value is 0 */
      if (offset) {
        /* selector 3 means (most recent offset - 1); selectors 1 and 2 index the history directly */
        size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
        temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
        /* move the selected offset to the front of the history */
        if (offset != 1)
          seqState->prevOffset[2] = seqState->prevOffset[1];
        seqState->prevOffset[1] = seqState->prevOffset[0];
        seqState->prevOffset[0] = offset = temp;
      } else {
        /* selector 0 : reuse the most recent offset; history is left unchanged */
        offset = seqState->prevOffset[0];
      }
    } else {
      /* new offset value : push it at the front of the history */
      seqState->prevOffset[2] = seqState->prevOffset[1];
      seqState->prevOffset[1] = seqState->prevOffset[0];
      seqState->prevOffset[0] = offset;
    }
    seq.offset = offset;
  }

  seq.matchLength = mlBase + ((mlBits > 0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <=  16 bits */
  /* reload only when the bits read so far plus the bits still needed may exceed the accumulator guarantee */
  if (MEM_32bits() && (mlBits + llBits >= STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32))
    BIT_reloadDStream(&seqState->DStream);
  if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64 - (LLFSELog + MLFSELog + OffFSELog)))
    BIT_reloadDStream(&seqState->DStream);
  /* Verify that there are enough bits to read the rest of the data in 64-bit mode. */
  ZSTD_STATIC_ASSERT(16 + LLFSELog + MLFSELog + OffFSELog < STREAM_ACCUMULATOR_MIN_64);

  seq.litLength = llBase + ((llBits > 0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <=  16 bits */
  if (MEM_32bits())
    BIT_reloadDStream(&seqState->DStream);

  /* predict the match address : `pos` is the position reached once the literals are
   * copied; an offset larger than `pos` is resolved relative to dictEnd instead of prefixStart */
  {
    size_t const pos = seqState->pos + seq.litLength;
    const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
    seq.match = matchBase + pos -
                seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only
                             * happen when input is corrupted. No consequence though : no memory access will occur,
                             * overly large offset will be detected in ZSTD_execSequenceLong() */
    seqState->pos = pos + seq.matchLength;
  }

  /* ANS state update */
  ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <=  9 bits */
  ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <=  9 bits */
  if (MEM_32bits())
    BIT_reloadDStream(&seqState->DStream);                       /* <= 18 bits */
  ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <=  8 bits */

  return seq;
}

/*! ZSTD_decompressSequencesLong_body() :
 *  Same job as the regular sequence decoder, organized as a small pipeline :
 *  sequences are decoded up to ADVANCED_SEQS ahead of their execution, and the
 *  match location of each decoded sequence is prefetched in the meantime.
 *  Decoded-but-not-yet-executed sequences wait in a ring of STORED_SEQS slots.
 *  When nbSeq != 0, dctx->fseEntropy is set and dctx->entropy.rep[] receives
 *  the final offset history.
 *  @return : nb of bytes written into `dst`, or an error code :
 *            corruption_detected if the bitstream cannot be initialized or ends
 *            before all sequences are decoded, dstSize_tooSmall if the last
 *            literals do not fit, or any error reported by ZSTD_execSequenceLong(). */
FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequencesLong_body(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  const BYTE* const seqBegin = (const BYTE*)seqStart;
  const BYTE* const seqEnd = seqBegin + seqSize;
  BYTE* const outBegin = (BYTE*)dst;
  BYTE* const outLimit = outBegin + maxDstSize;
  BYTE* outPos = outBegin;
  const BYTE* litCursor = dctx->litPtr;
  const BYTE* const litLimit = litCursor + dctx->litSize;
  const BYTE* const prefixBegin = (const BYTE*)(dctx->prefixStart);
  const BYTE* const dictBegin = (const BYTE*)(dctx->virtualStart);
  const BYTE* const dictLimit = (const BYTE*)(dctx->dictEnd);

  /* Regenerate sequences */
  if (nbSeq != 0) {
#define STORED_SEQS 4
#define STORED_SEQS_MASK (STORED_SEQS - 1)
#define ADVANCED_SEQS 4
    seq_t ring[STORED_SEQS];
    int const nbAhead = MIN(nbSeq, ADVANCED_SEQS);
    seqState_t state;
    int seqIdx;
    int repIdx;

    dctx->fseEntropy = 1;

    /* start from the offset history stored in the context */
    for (repIdx = 0; repIdx < ZSTD_REP_NUM; repIdx++) {
      state.prevOffset[repIdx] = dctx->entropy.rep[repIdx];
    }
    state.prefixStart = prefixBegin;
    state.pos = (size_t)(outPos - prefixBegin);
    state.dictEnd = dictLimit;

    assert(seqEnd >= seqBegin);
    CHECK_E(BIT_initDStream(&state.DStream, seqBegin, seqEnd - seqBegin), corruption_detected);
    ZSTD_initFseState(&state.stateLL, &state.DStream, dctx->LLTptr);
    ZSTD_initFseState(&state.stateOffb, &state.DStream, dctx->OFTptr);
    ZSTD_initFseState(&state.stateML, &state.DStream, dctx->MLTptr);

    /* stage 1 : fill the ring, decoding only */
    seqIdx = 0;
    while ((BIT_reloadDStream(&state.DStream) <= BIT_DStream_completed) && (seqIdx < nbAhead)) {
      seq_t* const slot = &ring[seqIdx];
      *slot = ZSTD_decodeSequenceLong(&state, isLongOffset);
      /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
      PREFETCH_L1(slot->match);
      PREFETCH_L1(slot->match + slot->matchLength - 1);
      seqIdx++;
    }
    if (seqIdx < nbAhead) {
      return ERROR(corruption_detected);
    }

    /* stage 2 : decode sequence #seqIdx while executing sequence #(seqIdx - ADVANCED_SEQS) */
    while ((BIT_reloadDStream(&state.DStream) <= BIT_DStream_completed) && (seqIdx < nbSeq)) {
      seq_t const decoded = ZSTD_decodeSequenceLong(&state, isLongOffset);
      size_t const producedSize = ZSTD_execSequenceLong(outPos,
          outLimit,
          ring[(seqIdx - ADVANCED_SEQS) & STORED_SEQS_MASK],
          &litCursor,
          litLimit,
          prefixBegin,
          dictBegin,
          dictLimit);
      if (ZSTD_isError(producedSize)) {
        return producedSize;
      }
      /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
      PREFETCH_L1(decoded.match);
      PREFETCH_L1(decoded.match + decoded.matchLength - 1);
      ring[seqIdx & STORED_SEQS_MASK] = decoded;
      outPos += producedSize;
      seqIdx++;
    }
    if (seqIdx < nbSeq) {
      return ERROR(corruption_detected);
    }

    /* stage 3 : execute the sequences still waiting in the ring */
    for (seqIdx -= nbAhead; seqIdx < nbSeq; seqIdx++) {
      size_t const producedSize = ZSTD_execSequenceLong(
          outPos, outLimit, ring[seqIdx & STORED_SEQS_MASK], &litCursor, litLimit, prefixBegin, dictBegin, dictLimit);
      if (ZSTD_isError(producedSize)) {
        return producedSize;
      }
      outPos += producedSize;
    }

    /* store the offset history back into the context, for the next block */
    for (repIdx = 0; repIdx < ZSTD_REP_NUM; repIdx++) {
      dctx->entropy.rep[repIdx] = (U32)(state.prevOffset[repIdx]);
    }
  }

  /* last literal segment */
  {
    size_t const remainingLits = (size_t)(litLimit - litCursor);
    size_t const remainingRoom = (size_t)(outLimit - outPos);
    if (remainingLits > remainingRoom) {
      return ERROR(dstSize_tooSmall);
    }
    memcpy(outPos, litCursor, remainingLits);
    outPos += remainingLits;
  }

  return (size_t)(outPos - outBegin);
}

/* ZSTD_decompressSequencesLong_default() :
 * Instantiation of ZSTD_decompressSequencesLong_body() without any target attribute.
 * Forwards all arguments unchanged and returns the body's result. */
static size_t ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart,
    size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

#if DYNAMIC_BMI2

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
/* ZSTD_decompressSequences_bmi2() :
 * Instantiation of ZSTD_decompressSequences_body() carrying TARGET_ATTRIBUTE("bmi2").
 * Only compiled when DYNAMIC_BMI2 is enabled; selected at runtime when dctx->bmi2 is set.
 * Forwards all arguments unchanged and returns the body's result. */
static TARGET_ATTRIBUTE("bmi2") size_t ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
/* ZSTD_decompressSequencesLong_bmi2() :
 * Instantiation of ZSTD_decompressSequencesLong_body() carrying TARGET_ATTRIBUTE("bmi2").
 * Only compiled when DYNAMIC_BMI2 is enabled; selected at runtime when dctx->bmi2 is set.
 * Forwards all arguments unchanged and returns the body's result. */
static TARGET_ATTRIBUTE("bmi2") size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
    const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

#endif /* DYNAMIC_BMI2 */

/* Function pointer type with the signature shared by all the sequence
 * decompression variants of this file (regular / long, default / bmi2). */
typedef size_t (*ZSTD_decompressSequences_t)(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart,
    size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset);

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
/* ZSTD_decompressSequences() :
 * Entry point of the regular (non-prefetching) sequence decoder.
 * When DYNAMIC_BMI2 is enabled and dctx->bmi2 is set, dispatches to the bmi2
 * instantiation; otherwise uses the default one.
 * @return : the result of the selected variant (decompressed size, or an error code) */
static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart,
    size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  DEBUGLOG(5, "ZSTD_decompressSequences");
#if DYNAMIC_BMI2
  if (dctx->bmi2) {
    return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
#endif
  return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
/* ZSTD_decompressSequencesLong() :
 * decompression function triggered when a minimum share of offsets is considered "long",
 * aka out of cache.
 * note : the definition of "long" is overloaded here : it sometimes means "wider than the bitstream register",
 * and sometimes means "farther than memory cache distance".
 * This function will try to mitigate main memory latency through the use of prefetching.
 * When DYNAMIC_BMI2 is enabled and dctx->bmi2 is set, dispatches to the bmi2
 * instantiation; otherwise uses the default one.
 * @return : the result of the selected variant (decompressed size, or an error code) */
static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart,
    size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
  DEBUGLOG(5, "ZSTD_decompressSequencesLong");
#if DYNAMIC_BMI2
  if (dctx->bmi2) {
    return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
  }
#endif
  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
/* ZSTD_getLongOffsetsShare() :
 * condition : offTable must be valid
 * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
 *           compared to maximum possible of (1<<OffFSELog) */
static unsigned ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
{
  const void* ptr = offTable;
  U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
  const ZSTD_seqSymbol* table = offTable + 1;
  U32 const max = 1 << tableLog;
  U32 u, total = 0;
  DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);

  assert(max <= (1 << OffFSELog)); /* max not too large */
  for (u = 0; u < max; u++) {
    if (table[u].nbAdditionalBits > 22)
      total += 1;
  }

  assert(tableLog <= OffFSELog);
  total <<= (OffFSELog - tableLog); /* scale to OffFSELog */

  return total;
}
#endif

/*! ZSTD_decompressBlock_internal() :
 *  Decompresses one compressed block : decodes the literals section, then the
 *  sequences header, then hands the remaining input to one of the two sequence
 *  decoders (regular, or "long" with prefetching).
 *  @param dctx : decompression context; dctx->ddictIsCold is cleared by this call
 *  @param dst, dstCapacity : destination buffer
 *  @param src, srcSize : compressed block; srcSize must be < ZSTD_BLOCKSIZE_MAX
 *  @param frame : non-zero when dctx->fParams.windowSize can be relied upon;
 *                 0 in block mode, in which case conservative choices are made
 *  @return : the result of the selected sequence decoder, or an error code
 *            (srcSize_wrong, or any error from the literals / sequences header decoding) */
size_t ZSTD_decompressBlock_internal(
    ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const int frame)
{ /* blockType == blockCompressed */
  const BYTE* ip = (const BYTE*)src;
  /* isLongOffset must be true if there are long offsets.
   * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
   * We don't expect that to be the case in 64-bit mode.
   * In block mode, window size is not known, so we have to be conservative.
   * (note: but it could be evaluated from current-lowLimit)
   */
  ZSTD_longOffset_e const isLongOffset =
      (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
  DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);

  if (srcSize >= ZSTD_BLOCKSIZE_MAX)
    return ERROR(srcSize_wrong);

  /* Decode literals section */
  {
    size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
    DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
    if (ZSTD_isError(litCSize))
      return litCSize;
    /* skip the literals section : what remains is the sequences section */
    ip += litCSize;
    srcSize -= litCSize;
  }

  /* Build Decoding Tables */
  {
    /* These macros control at build-time which decompressor implementation
     * we use. If neither is defined, we do some inspection and dispatch at
     * runtime.
     */
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
    /* a dictionary flagged as cold selects the prefetching decoder directly */
    int usePrefetchDecoder = dctx->ddictIsCold;
#endif
    int nbSeq;
    size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
    if (ZSTD_isError(seqHSize))
      return seqHSize;
    ip += seqHSize;
    srcSize -= seqHSize;

#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
    /* otherwise, only consider prefetching when the window may exceed 1<<24 bytes
     * and there are more sequences than the prefetch distance */
    if (!usePrefetchDecoder && (!frame || (dctx->fParams.windowSize > (1 << 24))) &&
        (nbSeq > ADVANCED_SEQS)) { /* could probably use a larger nbSeq limit */
      U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
      U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
      usePrefetchDecoder = (shareLongOffsets >= minShare);
    }
#endif

    dctx->ddictIsCold = 0;

    /* note : depending on the build macros, the `if` below is either present
     * (runtime dispatch) or absent (one of the two returns is unconditional) */
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
    if (usePrefetchDecoder)
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
      return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
#endif

#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
    /* else */
    return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
#endif
  }
}

/*! ZSTD_decompressBlock() :
 *  Decompresses a single compressed block in block mode
 *  (ZSTD_decompressBlock_internal() is invoked with `frame` == 0).
 *  @param dctx : decompression context; continuity with `dst` is checked first
 *  @param dst, dstCapacity : destination buffer
 *  @param src, srcSize : compressed block
 *  @return : nb of bytes written into `dst`, or an error code (testable with ZSTD_isError()).
 *  note : dctx->previousDstEnd is only advanced on success. An error code is
 *         not a byte count : adding it to `dst` would form an out-of-bounds
 *         pointer and leave the context with a bogus continuity marker. */
size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
{
  size_t dSize;
  ZSTD_checkContinuity(dctx, dst);
  dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
  if (ZSTD_isError(dSize))
    return dSize; /* nothing valid was produced : leave previousDstEnd untouched */
  dctx->previousDstEnd = (char*)dst + dSize;
  return dSize;
}
